# -------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
# -------------------------------------------------------------

# Autogenerated By   : src/main/python/generator/generator.py
# Autogenerated From : scripts/builtin/dedup.dml

from typing import Dict, Iterable

from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar
from systemds.utils.consts import VALID_INPUT_TYPES


def dedup(X: Frame,
          gloveMatrix: Matrix,
          vocab: Frame,
          **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
     Builtin for deduplication based on distributed representations (DRs)
     combined with locality-sensitive hashing (LSH) blocking.

     Every input tuple is encoded as a dense vector by averaging pre-trained
     GloVe embeddings. LSH then groups semantically similar tuples into
     buckets, and only pairs within a bucket are compared for deduplication.

     On the Python side this call only builds the operation graph: it returns
     a MultiReturn node named 'dedup' that owns two Frame outputs.



    :param X: Input Frame[String] with n rows and d columns (raw tuples)
    :param gloveMatrix: Matrix[Double] of size |V| × e (pretrained GloVe embeddings),
        where |V| is the number of words and e is the embedding dimension
    :param vocab: Frame[String] of size |V| × 1 (vocabulary aligned with gloveMatrix)
    :param similarityMeasure: (optional) String specifying similarity metric: "cosine", "euclidean"
    :param threshold: (optional) Double: threshold value above which tuples are considered duplicates
    :return: (first output) Frame[String] with deduplicated tuples
        (first occurrence of each duplicate group is retained)
    :return: (second output) Frame[String] with all detected duplicates
        (i.e., tuples removed from the input)
    """

    sds = X.sds_context

    # Required inputs first; optional keyword arguments are appended after them.
    named_inputs = {'X': X, 'gloveMatrix': gloveMatrix, 'vocab': vocab, **kwargs}

    # One placeholder Frame per builtin output (deduplicated tuples, duplicates).
    outputs = [Frame(sds, '') for _ in range(2)]

    node = MultiReturn(sds, 'dedup', outputs, named_input_nodes=named_inputs)

    # Link each output back to the multi-return operation that produces it.
    for out in outputs:
        out._unnamed_input_nodes = [node]

    return node
